#!/usr/bin/python

import sys
from numpy import prod
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace
# once the interpreter has started, and reload(sys) makes it reachable again.
# It is used here to switch the process-wide default string encoding to UTF-8.
# NOTE(review): this is a global hack and neither call exists in this form on
# Python 3 -- revisit if the script is ever ported.
reload(sys)  # Reload does the trick!
sys.setdefaultencoding('UTF-8')
import pickle
import unicodecsv as csv
import re
import unidecode

# Location of the dynamic-nmf checkout. It has to be on sys.path before the
# project imports that follow (text.*, unsupervised, estimate.*) can resolve.
# The default below is one developer's machine; set the DYNAMIC_NMF_PATH
# environment variable to run the script elsewhere without editing this file.
# This is still a hack: the real fix is to install dynamic-nmf as a package.
import os
dynamic_nmf_path = os.environ.get(
    "DYNAMIC_NMF_PATH",
    "/home/constantine/Dropbox/cards/baby-cards/dynamic-nmf")
sys.path.append(dynamic_nmf_path)

from text.slices import TimeSlices
from text.prepare import create_text_files
from text.prepare import process_text
import unsupervised
from estimate.window import FitWindow
from estimate.dynamic import FitDynamic

#------------------------------------------------------------------
# Read data and initialize data

# Change working directory to make life easier: the relative './data/...'
# paths used from here on are resolved against the dynamic-nmf checkout.
import os
os.chdir(dynamic_nmf_path)

# Load the sample data. This is the CTT data for baby-cards.
# A pickle is a byte stream, so the file must be opened in binary mode;
# text mode can corrupt it on platforms that translate line endings.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('./data/skeptics.pkl', 'rb') as pfile:
    content = pickle.load(pfile)

# Transliterate the post-processed text to plain ASCII (stored under
# 'text_final') so the semantic coherence measure works properly.
for row in content:
    row['text_final'] = unicode(unidecode.unidecode(row['text_postprocessed']))

# Get the "time slice" data needed for the DtmNmfModel class
ts = TimeSlices(content, 'date')
time_slice_data = ts.get('text_final')

# Keep only slices whose last field is 2007 or later (presumably the slice's
# year -- confirm against TimeSlices.get). Each window needs enough data to
# run, or the fit errors out with an unhelpful message.
time_slice_data = [row for row in time_slice_data if row[-1] >= 2007]

#---------------------------------------------------------------------
# Prepare data for nmf

# Hand the filtered slices to create_text_files, targeting ./data, to get
# them into the form dynamic-nmf expects (overwrite is switched on).
dpaths = create_text_files(time_slice_data, './data', overwrite=True)

# Only the last field of each returned record is needed: the path.
paths = [record[-1] for record in dpaths]

# Run the text processing step over those paths.
texts = process_text(paths)


#---------------------------------------------------------------------
# Estimate topics

# Stage 1 -- window topics.
# ks is passed to FitWindow alongside the processed texts; presumably one
# topic count per time window, in order -- confirm against FitWindow.
ks = [
    15, 20, 15, 25, 30, 15,
    20, 35, 20, 15, 20,
]
fw = FitWindow(texts, ks)
window_topics = fw.fit()

# Stage 2 -- dynamic topics, fitted on top of the window-level results.
fd = FitDynamic(window_topics)
fd.fit(20, verbose=True)

# Pull out the dynamic topic keywords and the document-topic weight matrix.
keywords = fd.get_dynamic_topics(20)
documents = fd.get_document_topics()




